import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import ast
import matplotlib.image as mpimg
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython import display
import matplotlib.image as mpimg


from pathlib import Path
# Read a local GIF file and wrap its raw bytes in an IPython Image object.
gifPath = Path("D:\\giphy (3).gif")
with open(gifPath,'rb') as f:
    # NOTE(review): the file is a .gif but is passed with format='png' — confirm this is intentional.
    display.Image(data=f.read(), format='png')


# Load the movie metadata table and preview its first rows.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is the remedy the DtypeWarning itself
# suggests for the mixed-type column reported on import.
meta = pd.read_csv('D:\\movies_metadata.csv', low_memory=False)
meta.head()

D:\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3444: DtypeWarning: Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)


# Load the credits table (cast and crew per movie id) and preview its first rows.
crew = pd.read_csv('D:\\credits.csv')
crew.head()


# Keep only the first row for every movie id in both tables.
# The de-duplication has to be done on the DataFrame itself: calling
# drop_duplicates(inplace=True) on the `id` Series only shortens that detached
# Series and leaves the frame's rows untouched, so duplicated ids would
# multiply rows in the later merge.
meta = meta.drop_duplicates(subset='id', keep='first')
crew = crew.drop_duplicates(subset='id', keep='first')


# Drop metadata rows whose `original_language` contains digits (presumably
# malformed rows whose fields are shifted — TODO confirm against the raw CSV).
# str.count yields NaN for missing values and NaN != 0 is True, so rows with
# no language at all are removed as well, exactly as before.
# The pattern is a raw string so that \d is not treated as an (invalid)
# string escape.
malformed_language = meta.original_language.str.count(r'\d') != 0
meta = meta.drop(meta[malformed_language].index)
# No corresponding drop on `crew`: the two tables do not share row labels, and
# the old second statement recomputed the mask on the already-filtered `meta`,
# so it always selected nothing. Rows of `crew` without a matching id are
# discarded by the merge on `id` instead.


# Cast the movie id to integer (presumably so it matches the ids in `crew`).
meta['id'] = meta['id'].astype(int)


# Join metadata and credits on the shared `id` column and summarise the result.
movies = pd.merge(meta, crew, on='id')
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45527 entries, 0 to 45526
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45527 non-null  object 
 1   belongs_to_collection  4500 non-null   object 
 2   budget                 45527 non-null  object 
 3   genres                 45527 non-null  object 
 4   homepage               7792 non-null   object 
 5   id                     45527 non-null  int32  
 6   imdb_id                45510 non-null  object 
 7   original_language      45527 non-null  object 
 8   original_title         45527 non-null  object 
 9   overview               44574 non-null  object 
 10  popularity             45524 non-null  object 
 11  poster_path            45141 non-null  object 
 12  production_companies   45524 non-null  object 
 13  production_countries   45524 non-null  object 
 14  release_date           45440 non-null  object 
 15  revenue                45524 non-null  float64
 16  runtime                45267 non-null  float64
 17  spoken_languages       45524 non-null  object 
 18  status                 45443 non-null  object 
 19  tagline                20439 non-null  object 
 20  title                  45524 non-null  object 
 21  video                  45524 non-null  object 
 22  vote_average           45524 non-null  float64
 23  vote_count             45524 non-null  float64
 24  cast                   45527 non-null  object 
 25  crew                   45527 non-null  object 
dtypes: float64(4), int32(1), object(21)
memory usage: 9.2+ MB


# Work on a copy so the merged table itself is preserved.
movies_clean = movies.copy()


# Columns removed before the analysis.
dropped_columns = [
    'imdb_id',
    'tagline',
    'video',
    'homepage',
    'adult',
    'poster_path',
    'belongs_to_collection',
    'overview',
]
movies_clean = movies_clean.drop(columns=dropped_columns)
movies_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45527 entries, 0 to 45526
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                45527 non-null  object 
 1   genres                45527 non-null  object 
 2   id                    45527 non-null  int32  
 3   original_language     45527 non-null  object 
 4   original_title        45527 non-null  object 
 5   popularity            45524 non-null  object 
 6   production_companies  45524 non-null  object 
 7   production_countries  45524 non-null  object 
 8   release_date          45440 non-null  object 
 9   revenue               45524 non-null  float64
 10  runtime               45267 non-null  float64
 11  spoken_languages      45524 non-null  object 
 12  status                45443 non-null  object 
 13  title                 45524 non-null  object 
 14  vote_average          45524 non-null  float64
 15  vote_count            45524 non-null  float64
 16  cast                  45527 non-null  object 
 17  crew                  45527 non-null  object 
dtypes: float64(4), int32(1), object(13)
memory usage: 6.4+ MB


# Remove films whose status is anything other than 'Released'
# (a missing status also compares as "not equal" and is removed).
not_released = movies_clean.loc[movies_clean['status'].ne('Released')]
movies_clean.drop(index=not_released.index, inplace=True)


# Every remaining film has status 'Released', so drop the column entirely.
movies_clean = movies_clean.drop(columns=['status'])


# Remove films whose runtime is 0, then films whose vote count is 0.
zero_runtime = movies_clean.loc[movies_clean['runtime'].eq(0)]
movies_clean.drop(index=zero_runtime.index, inplace=True)

zero_vote = movies_clean.loc[movies_clean['vote_count'].eq(0)]
movies_clean.drop(index=zero_vote.index, inplace=True)


# Change data types: popularity and budget go through str to float,
# release_date becomes a datetime.
for numeric_column in ('popularity', 'budget'):
    movies_clean[numeric_column] = movies_clean[numeric_column].astype(str).astype(float)
movies_clean['release_date'] = pd.to_datetime(movies_clean['release_date'])

movies_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41065 entries, 0 to 45524
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   budget                41065 non-null  float64       
 1   genres                41065 non-null  object        
 2   id                    41065 non-null  int32         
 3   original_language     41065 non-null  object        
 4   original_title        41065 non-null  object        
 5   popularity            41065 non-null  float64       
 6   production_companies  41065 non-null  object        
 7   production_countries  41065 non-null  object        
 8   release_date          41041 non-null  datetime64[ns]
 9   revenue               41065 non-null  float64       
 10  runtime               40911 non-null  float64       
 11  spoken_languages      41065 non-null  object        
 12  title                 41065 non-null  object        
 13  vote_average          41065 non-null  float64       
 14  vote_count            41065 non-null  float64       
 15  cast                  41065 non-null  object        
 16  crew                  41065 non-null  object        
dtypes: datetime64[ns](1), float64(6), int32(1), object(9)
memory usage: 5.5+ MB


# Preview the title next to the list-like text columns for the first three films.
movies_clean[['title', 'genres', 'production_companies', 'production_countries', 'spoken_languages']].head(3)


# From production_countries, production_companies, spoken_languages and genres
# keep only the information we need: the name of the first entry of each list.


def _first_name(raw):
    """Return the 'name' of the first entry of a stringified list of dicts.

    raw is expected to be text such as "[{'id': 16, 'name': 'Animation'}, ...]".
    The text is parsed with ast.literal_eval rather than split on quote
    characters, so names that contain an apostrophe (whose repr is wrapped in
    double quotes) are returned intact instead of being cut at the apostrophe.

    Returns np.nan when raw is not a string, cannot be parsed, is not a
    non-empty list, or its first entry is not a dict with a 'name' key.
    """
    if not isinstance(raw, str):
        return np.nan
    try:
        entries = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return np.nan
    if not isinstance(entries, list) or not entries:
        return np.nan
    first_entry = entries[0]
    if not isinstance(first_entry, dict):
        return np.nan
    return first_entry.get('name', np.nan)


for _nested_column in ('production_countries', 'production_companies', 'spoken_languages', 'genres'):
    movies_clean[_nested_column] = movies_clean[_nested_column].apply(_first_name)


# Derive the release year from release_date and count how many films fall in each year.
movies_clean['year'] = movies_clean['release_date'].dt.year
movies_clean['year'].value_counts()

2014.0    1780
2013.0    1759
2015.0    1699
2012.0    1576
2011.0    1525
          ... 
1878.0       1
1874.0       1
1887.0       1
1883.0       1
1893.0       1
Name: year, Length: 133, dtype: int64


# Check non-null counts and dtypes after the preceding transformations.
movies_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41065 entries, 0 to 45524
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   budget                41065 non-null  float64       
 1   genres                39663 non-null  object        
 2   id                    41065 non-null  int32         
 3   original_language     41065 non-null  object        
 4   original_title        41065 non-null  object        
 5   popularity            41065 non-null  float64       
 6   production_companies  31747 non-null  object        
 7   production_countries  36536 non-null  object        
 8   release_date          41041 non-null  datetime64[ns]
 9   revenue               41065 non-null  float64       
 10  runtime               40911 non-null  float64       
 11  spoken_languages      38584 non-null  object        
 12  title                 41065 non-null  object        
 13  vote_average          41065 non-null  float64       
 14  vote_count            41065 non-null  float64       
 15  cast                  41065 non-null  object        
 16  crew                  41065 non-null  object        
 17  year                  41041 non-null  float64       
dtypes: datetime64[ns](1), float64(7), int32(1), object(9)
memory usage: 5.8+ MB


# Discard films that have no genre.
movies_clean = movies_clean.dropna(subset=['genres'])


movies_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39663 entries, 0 to 45524
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   budget                39663 non-null  float64       
 1   genres                39663 non-null  object        
 2   id                    39663 non-null  int32         
 3   original_language     39663 non-null  object        
 4   original_title        39663 non-null  object        
 5   popularity            39663 non-null  float64       
 6   production_companies  31502 non-null  object        
 7   production_countries  35944 non-null  object        
 8   release_date          39654 non-null  datetime64[ns]
 9   revenue               39663 non-null  float64       
 10  runtime               39546 non-null  float64       
 11  spoken_languages      37734 non-null  object        
 12  title                 39663 non-null  object        
 13  vote_average          39663 non-null  float64       
 14  vote_count            39663 non-null  float64       
 15  cast                  39663 non-null  object        
 16  crew                  39663 non-null  object        
 17  year                  39654 non-null  float64       
dtypes: datetime64[ns](1), float64(7), int32(1), object(9)
memory usage: 5.6+ MB


# Discard films missing either the production company or the production country.
production_columns = ['production_companies', 'production_countries']
movies_clean = movies_clean.dropna(subset=production_columns)


movies_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31079 entries, 0 to 45524
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   budget                31079 non-null  float64       
 1   genres                31079 non-null  object        
 2   id                    31079 non-null  int32         
 3   original_language     31079 non-null  object        
 4   original_title        31079 non-null  object        
 5   popularity            31079 non-null  float64       
 6   production_companies  31079 non-null  object        
 7   production_countries  31079 non-null  object        
 8   release_date          31078 non-null  datetime64[ns]
 9   revenue               31079 non-null  float64       
 10  runtime               31029 non-null  float64       
 11  spoken_languages      30568 non-null  object        
 12  title                 31079 non-null  object        
 13  vote_average          31079 non-null  float64       
 14  vote_count            31079 non-null  float64       
 15  cast                  31079 non-null  object        
 16  crew                  31079 non-null  object        
 17  year                  31078 non-null  float64       
dtypes: datetime64[ns](1), float64(7), int32(1), object(9)
memory usage: 4.4+ MB


# Discard films with a missing value in any of the remaining required columns.
required_columns = ['release_date', 'runtime', 'spoken_languages', 'year']
movies_clean = movies_clean.dropna(subset=required_columns)


movies_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30528 entries, 0 to 45524
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   budget                30528 non-null  float64       
 1   genres                30528 non-null  object        
 2   id                    30528 non-null  int32         
 3   original_language     30528 non-null  object        
 4   original_title        30528 non-null  object        
 5   popularity            30528 non-null  float64       
 6   production_companies  30528 non-null  object        
 7   production_countries  30528 non-null  object        
 8   release_date          30528 non-null  datetime64[ns]
 9   revenue               30528 non-null  float64       
 10  runtime               30528 non-null  float64       
 11  spoken_languages      30528 non-null  object        
 12  title                 30528 non-null  object        
 13  vote_average          30528 non-null  float64       
 14  vote_count            30528 non-null  float64       
 15  cast                  30528 non-null  object        
 16  crew                  30528 non-null  object        
 17  year                  30528 non-null  float64       
dtypes: datetime64[ns](1), float64(7), int32(1), object(9)
memory usage: 4.3+ MB


# Store the release year as an integer instead of a float.
movies_clean['year'] = movies_clean['year'].astype(float).astype(int)


movies_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30528 entries, 0 to 45524
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   budget                30528 non-null  float64       
 1   genres                30528 non-null  object        
 2   id                    30528 non-null  int32         
 3   original_language     30528 non-null  object        
 4   original_title        30528 non-null  object        
 5   popularity            30528 non-null  float64       
 6   production_companies  30528 non-null  object        
 7   production_countries  30528 non-null  object        
 8   release_date          30528 non-null  datetime64[ns]
 9   revenue               30528 non-null  float64       
 10  runtime               30528 non-null  float64       
 11  spoken_languages      30528 non-null  object        
 12  title                 30528 non-null  object        
 13  vote_average          30528 non-null  float64       
 14  vote_count            30528 non-null  float64       
 15  cast                  30528 non-null  object        
 16  crew                  30528 non-null  object        
 17  year                  30528 non-null  int32         
dtypes: datetime64[ns](1), float64(6), int32(2), object(9)
memory usage: 4.2+ MB


def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    x is an iterable of dicts; each entry is read through its 'job' key and,
    for the first match, its 'name' key. Returns np.nan when no entry has the
    job 'Director'.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)


# Parse the stringified crew lists, then pick each film's director.
parsed_crew = movies_clean['crew'].apply(ast.literal_eval)
movies_clean['director'] = parsed_crew.apply(get_director)
movies_clean['director']

0          John Lasseter
1           Joe Johnston
2          Howard Deutch
3        Forest Whitaker
4          Charles Shyer
              ...       
45519           Ben Rock
45520      Aaron Osborne
45521         John Irvin
45523           Lav Diaz
45524     Mark L. Lester
Name: director, Length: 30528, dtype: object


def get_director_gender(x):
    """Return the gender label of the first crew entry whose job is 'Director'.

    x is an iterable of dicts read through their 'job' key; the first match is
    also read through its 'gender' key. Code 1 maps to "female", code 2 to
    "male" and any other value to "unknown". Returns np.nan when no entry has
    the job 'Director'.
    """
    director = next((member for member in x if member['job'] == 'Director'), None)
    if director is None:
        return np.nan

    gender_code = director['gender']
    if gender_code == 1:
        return "female"
    if gender_code == 2:
        return "male"
    return "unknown"


# Parse the stringified crew lists, label each film's director gender,
# then tabulate how often each label occurs.
parsed_crew = movies_clean['crew'].apply(ast.literal_eval)
movies_clean['director_gender'] = parsed_crew.apply(get_director_gender)
gender = movies_clean['director_gender'].value_counts().to_frame()
gender


# Check non-null counts and dtypes now that the director columns exist.
movies_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30439 entries, 0 to 45524
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   budget                30439 non-null  float64       
 1   genres                30439 non-null  object        
 2   id                    30439 non-null  int32         
 3   original_language     30439 non-null  object        
 4   original_title        30439 non-null  object        
 5   popularity            30439 non-null  float64       
 6   production_companies  30439 non-null  object        
 7   production_countries  30439 non-null  object        
 8   release_date          30439 non-null  datetime64[ns]
 9   revenue               30439 non-null  float64       
 10  runtime               30439 non-null  float64       
 11  spoken_languages      30439 non-null  object        
 12  title                 30439 non-null  object        
 13  vote_average          30439 non-null  float64       
 14  vote_count            30439 non-null  float64       
 15  cast                  30439 non-null  object        
 16  crew                  30439 non-null  object        
 17  year                  30439 non-null  int32         
 18  director              30439 non-null  object        
 19  director_gender       30439 non-null  object        
dtypes: datetime64[ns](1), float64(6), int32(2), object(11)
memory usage: 5.7+ MB


# Discard films for which no director (and hence no director gender) was found.
director_columns = ['director', 'director_gender']
movies_clean = movies_clean.dropna(subset=director_columns)
movies_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30439 entries, 0 to 45524
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   budget                30439 non-null  float64       
 1   genres                30439 non-null  object        
 2   id                    30439 non-null  int32         
 3   original_language     30439 non-null  object        
 4   original_title        30439 non-null  object        
 5   popularity            30439 non-null  float64       
 6   production_companies  30439 non-null  object        
 7   production_countries  30439 non-null  object        
 8   release_date          30439 non-null  datetime64[ns]
 9   revenue               30439 non-null  float64       
 10  runtime               30439 non-null  float64       
 11  spoken_languages      30439 non-null  object        
 12  title                 30439 non-null  object        
 13  vote_average          30439 non-null  float64       
 14  vote_count            30439 non-null  float64       
 15  cast                  30439 non-null  object        
 16  crew                  30439 non-null  object        
 17  year                  30439 non-null  int32         
 18  director              30439 non-null  object        
 19  director_gender       30439 non-null  object        
dtypes: datetime64[ns](1), float64(6), int32(2), object(11)
memory usage: 4.6+ MB


# Ten highest-budget films: sort by budget (descending), keep title and budget,
# and index the result by title so the titles become the bar labels.
stulpeliai = ['title', 'budget']
biudžetas = movies_clean.sort_values('budget', ascending=False)[stulpeliai].set_index('title')
top_10_biudžetas = biudžetas.head(10)

# Bar chart of those ten budgets; x tick labels are rotated and left-aligned.
fig, ax = plt.subplots(figsize=(15,5))
sns.set_style('white')
sns.barplot(data=top_10_biudžetas, x=top_10_biudžetas.index, y='budget');
plt.xticks(ha='left', rotation=-20, fontsize=15); plt.yticks(fontsize=15)
# NOTE(review): the y label states the unit as 'USD 100 Million' — presumably matching the axis scale offset; confirm.
plt.xlabel(''); plt.ylabel('USD 100 Million', fontsize=15 );
plt.title('Top 10 didžiausio biudžeto filmai', fontsize=15, weight = 'bold' );

# Ten highest-revenue films: sort by revenue (descending), keep title and
# revenue, and index the result by title so the titles become the bar labels.
stulpeliai = ['title', 'revenue']
uždarbis = movies_clean.sort_values('revenue', ascending=False)[stulpeliai].set_index('title')
top_10_uždarbis = uždarbis.head(10)

# Bar chart of those ten revenues; x tick labels are rotated and left-aligned.
fig, ax = plt.subplots(figsize=(15,5))
sns.set_style('white')
sns.barplot(data=top_10_uždarbis, x=top_10_uždarbis.index, y='revenue');
plt.xticks(ha='left', rotation=-20, fontsize=15); plt.yticks(fontsize=15)
# NOTE(review): the y label states the unit as 'USD Billion' — presumably matching the axis scale offset; confirm.
plt.xlabel(''); plt.ylabel('USD Billion', fontsize=15);
plt.title('Top 10 daugiausiai uždirbę filmai', fontsize=15, weight = 'bold' );

# Profit per film = revenue - budget, joined back onto the table by row index
# and sorted from most to least profitable.
pelnas = movies_clean['revenue'] - movies_clean['budget']
pelnas.name = 'profit'
pelnas = movies_clean.join(pelnas)[['title', 'budget', 'revenue', 'profit']].sort_values('profit', ascending=False)
top_10_pelnas = pelnas.head(10).set_index('title')

# Profit as a percentage of budget for the ten most profitable films.
pelnas_procentais = (top_10_pelnas['profit'] / top_10_pelnas['budget'] * 100)
# The rename targets column label 0 — presumably the label to_frame() gives the unnamed ratio Series.
pelnas_procentais = pelnas_procentais.sort_values(ascending=False).to_frame().rename(columns={0:'Pelnas procentais'})

# Bar chart of the percentages, one bar per title.
fig, ax = plt.subplots(figsize=(15,5))
sns.set_style('white')
sns.barplot(data=pelnas_procentais, x=pelnas_procentais.index, y='Pelnas procentais')
plt.xticks(ha='left', rotation=-20, fontsize=15); plt.yticks(fontsize=15)
plt.xlabel(''); plt.ylabel('Pelnas procentais', fontsize=15);
plt.title('Pelnas procentais Top 10 pelningiausių filmų', fontsize=15, weight = 'bold');


# Show every row whose title is exactly 'Avatar'.
movies_clean[movies_clean['title']== 'Avatar']


# Ten biggest losses among films that reported some revenue: `pelnas` is
# sorted by profit in descending order where it is built, so tail(10) holds
# the lowest profits; they are then re-sorted by profit and revenue.
top_10_nuostolis = pelnas[pelnas['revenue'] > 0].tail(10).sort_values(['profit', 'revenue']).set_index('title')

# Grouped bar chart of the remaining numeric columns for each title.
# NOTE(review): the 'seaborn' style name is deprecated in newer matplotlib releases — confirm it exists in the target environment.
plt.style.use('seaborn')
top_10_nuostolis.plot(kind='bar', figsize=(20,6), fontsize=20)
plt.ylabel('USD Billion', fontsize=20); plt.xlabel('')
plt.xticks(rotation=-20, ha='left')
plt.suptitle('Top 10 nuostolingiausių filmų', fontsize=20)
plt.legend(fontsize=15);


# Read a local GIF file and wrap its raw bytes in an IPython Image object.
gifPath = Path("D:\\johnny-depp-the-lone-ranger.gif")
with open(gifPath,'rb') as f:
    # NOTE(review): the file is a .gif but is passed with format='png' — confirm this is intentional.
    display.Image(data=f.read(), format='png')


# Show every row whose title is exactly 'The Lone Ranger'.
movies_clean[movies_clean['title']== 'The Lone Ranger']


# Horizontal bar chart of the ten directors with the most films in the table.
plt.figure(figsize=[10,6])
# value_counts() is evaluated twice: once for the labels, once for the counts.
top_dir = movies_clean.director.value_counts().head(10).index
count = movies_clean.director.value_counts().head(10)

plt.barh(top_dir, count, color='#444444')

plt.title('Top 10 režisieriai')
plt.xlabel('Filmų skaičius');


# Horizontal bar chart of the number of films per genre.
plt.figure(figsize=[10,6])
# value_counts() is evaluated twice: once for the labels, once for the counts.
zanras = movies_clean.genres.value_counts().index
count = movies_clean.genres.value_counts()
plt.barh(zanras, count, color='#444444')

plt.title('Žanrai')
plt.xlabel('Filmų skaičius');


# Concatenate every title into one space-separated string to feed the word cloud.
original_corpus = ' '.join(movies_clean['title'])

# Build the cloud with the library's STOPWORDS set excluded, then show it without axes.
title_wordcloud = WordCloud(stopwords=STOPWORDS, background_color='black', height=2000, width=4000).generate(original_corpus)
plt.figure(figsize=(20,8))
plt.imshow(title_wordcloud)
plt.axis('off')
plt.show()

<Figure size 1440x576 with 0 Axes>

<matplotlib.image.AxesImage at 0x1c7a9d5a430>

(-0.5, 3999.5, 1999.5, -0.5)


# Ten best-rated films among those with more than 5000 votes, ordered by vote_average.
movies_clean[movies_clean['vote_count'] > 5000][['title', 'vote_average', 'vote_count' ,'year', 'popularity']].sort_values('vote_average', ascending=False).head(10)


# Histogram of vote_average with bin edges every 0.5 from 0 up to and including 10.
vote_avg=np.arange(0,10+0.5,0.5)
plt.hist(data=movies_clean, x='vote_average',bins=vote_avg);
plt.title('Balsu vidurkis');


# Scatter plot of runtime against release year.
# `base_color` was never assigned anywhere in this notebook, so the cell raised
# NameError on a fresh run. 'C0' is matplotlib's name for the first colour of
# the active colour cycle; a colour string is used rather than an RGB tuple so
# that scatter cannot mistake it for per-point values.
base_color = 'C0'
movies_clean.plot.scatter(x='year', y = 'runtime', color=base_color)

<AxesSubplot:xlabel='year', ylabel='runtime'>


# Show the films whose runtime exceeds 800.
movies_clean[movies_clean.runtime > 800]


# Tabulate how many films fall under each director_gender label.
gender= pd.DataFrame(movies_clean['director_gender'].value_counts())
gender


# Strip plot of release year, one panel (and hue) per director_gender label.
sns.catplot(x="year", hue="director_gender", col="director_gender", data=movies_clean, kind="strip", legend=True);


# Ten most popular films whose director_gender is 'female', ordered by popularity.
movies_clean[movies_clean['director_gender']== 'female'][['title', 'vote_count', 'vote_average', 'director_gender', 'director', 'popularity','year']].sort_values('popularity', ascending=False).head(10)


# Read a local GIF file and wrap its raw bytes in an IPython Image object.
gifPath = Path("D:\\giphy (5).gif")
with open(gifPath,'rb') as f:
    # NOTE(review): the file is a .gif but is passed with format='png' — confirm this is intentional.
    display.Image(data=f.read(), format='png')

	adult	belongs_to_collection	budget	genres	homepage	id	imdb_id	original_language	original_title	overview	...	release_date	revenue	runtime	spoken_languages	status	tagline	title	video	vote_average	vote_count
0	False	{'id': 10194, 'name': 'Toy Story Collection', ...	30000000	[{'id': 16, 'name': 'Animation'}, {'id': 35, '...	http://toystory.disney.com/toy-story	862	tt0114709	en	Toy Story	Led by Woody, Andy's toys live happily in his ...	...	1995-10-30	373554033.0	81.0	[{'iso_639_1': 'en', 'name': 'English'}]	Released	NaN	Toy Story	False	7.7	5415.0
1	False	NaN	65000000	[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...	NaN	8844	tt0113497	en	Jumanji	When siblings Judy and Peter discover an encha...	...	1995-12-15	262797249.0	104.0	[{'iso_639_1': 'en', 'name': 'English'}, {'iso...	Released	Roll the dice and unleash the excitement!	Jumanji	False	6.9	2413.0
2	False	{'id': 119050, 'name': 'Grumpy Old Men Collect...	0	[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...	NaN	15602	tt0113228	en	Grumpier Old Men	A family wedding reignites the ancient feud be...	...	1995-12-22	0.0	101.0	[{'iso_639_1': 'en', 'name': 'English'}]	Released	Still Yelling. Still Fighting. Still Ready for...	Grumpier Old Men	False	6.5	92.0
3	False	NaN	16000000	[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...	NaN	31357	tt0114885	en	Waiting to Exhale	Cheated on, mistreated and stepped on, the wom...	...	1995-12-22	81452156.0	127.0	[{'iso_639_1': 'en', 'name': 'English'}]	Released	Friends are the people who let you be yourself...	Waiting to Exhale	False	6.1	34.0
4	False	{'id': 96871, 'name': 'Father of the Bride Col...	0	[{'id': 35, 'name': 'Comedy'}]	NaN	11862	tt0113041	en	Father of the Bride Part II	Just when George Banks has recovered from his ...	...	1995-02-10	76578911.0	106.0	[{'iso_639_1': 'en', 'name': 'English'}]	Released	Just When His World Is Back To Normal... He's ...	Father of the Bride Part II	False	5.7	173.0

	cast	crew	id
0	[{'cast_id': 14, 'character': 'Woody (voice)',...	[{'credit_id': '52fe4284c3a36847f8024f49', 'de...	862
1	[{'cast_id': 1, 'character': 'Alan Parrish', '...	[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...	8844
2	[{'cast_id': 2, 'character': 'Max Goldman', 'c...	[{'credit_id': '52fe466a9251416c75077a89', 'de...	15602
3	[{'cast_id': 1, 'character': "Savannah 'Vannah...	[{'credit_id': '52fe44779251416c91011acb', 'de...	31357
4	[{'cast_id': 1, 'character': 'George Banks', '...	[{'credit_id': '52fe44959251416c75039ed7', 'de...	11862

	title	genres	production_companies	production_countries	spoken_languages
0	Toy Story	[{'id': 16, 'name': 'Animation'}, {'id': 35, '...	[{'name': 'Pixar Animation Studios', 'id': 3}]	[{'iso_3166_1': 'US', 'name': 'United States o...	[{'iso_639_1': 'en', 'name': 'English'}]
1	Jumanji	[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...	[{'name': 'TriStar Pictures', 'id': 559}, {'na...	[{'iso_3166_1': 'US', 'name': 'United States o...	[{'iso_639_1': 'en', 'name': 'English'}, {'iso...
2	Grumpier Old Men	[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...	[{'name': 'Warner Bros.', 'id': 6194}, {'name'...	[{'iso_3166_1': 'US', 'name': 'United States o...	[{'iso_639_1': 'en', 'name': 'English'}]

	budget	genres	id	original_language	original_title	popularity	production_companies	production_countries	release_date	revenue	runtime	spoken_languages	title	vote_average	vote_count	cast	crew	year	director	director_gender
5029	0.0	Adventure	108266	en	The Lone Ranger	0.889869	Wrather Productions	United States of America	1956-02-25	0.0	86.0	English	The Lone Ranger	6.1	7.0	[{'cast_id': 2, 'character': 'The Lone Ranger'...	[{'credit_id': '52fe4a94c3a36847f81d632b', 'de...	1956	Stuart Heisler	male
21247	255000000.0	Action	57201	en	The Lone Ranger	12.729104	Walt Disney Pictures	United States of America	2013-07-03	89289910.0	149.0	English	The Lone Ranger	5.9	2361.0	[{'cast_id': 4, 'character': 'Tonto', 'credit_...	[{'credit_id': '52fe4928c3a36847f818be95', 'de...	2013	Gore Verbinski	male

	title	vote_average	vote_count	year	popularity
314	The Shawshank Redemption	8.5	8358.0	1994	51.645403
837	The Godfather	8.5	6024.0	1972	41.109264
292	Pulp Fiction	8.3	8670.0	1994	140.950236
12525	The Dark Knight	8.3	12269.0	2008	123.167259
2854	Fight Club	8.3	9678.0	1999	63.869599
18537	The Intouchables	8.2	5410.0	2011	16.086919
351	Forrest Gump	8.2	8147.0	1994	48.307194
1163	The Empire Strikes Back	8.2	5998.0	1980	19.470959
256	Star Wars	8.1	6778.0	1977	42.149697
46	Se7en	8.1	5915.0	1995	18.457430

Kino industrijos analizė

Duomenų rinkinio apžvalga

Pagrindiniai iškelti klausimai

Duomenų valymas ir tvarkymas

1. Kurie filmai daugiausiai uždirbo? Didžiausio biudžeto filmai?

Kurie filmai buvo nuostolingi?

Kurie režisieriai yra sukūrę daugiausiai filmų?

Kokio žanro filmų yra kuriama daugiausiai?

Kokie žodžiai yra dažniausiai pasikartojantys filmų pavadinimuose?

Kurie filmai TMDB bendruomenės įvertinti geriausiai?

Kaip kito filmų ilgis bėgant metams?

Kino pramone valdo vyrai?

	genres	id	original_language	original_title	popularity	production_companies	production_countries	release_date	runtime	spoken_languages	title	vote_average	vote_count	cast	crew	year	director	director_gender
13824	Drama	45560	de	Berlin Alexanderplatz	2.255785	Bavaria Film	Germany	1980-08-28	931.0	Deutsch	Berlin Alexanderplatz	8.4	5.0	[{'cast_id': 2, 'character': 'Franz Biberkopf'...	[{'credit_id': '5653d1dac3a36850fc001c27', 'de...	1980	Rainer Werner Fassbinder	male
19230	Documentary	208988	en	The War	0.487489	PBS	United States of America	2007-09-23	874.0	English	The War	5.3	3.0	[{'cast_id': 3, 'character': 'Narrator', 'cred...	[{'credit_id': '52fe4d58c3a368484e1e43a5', 'de...	2007	Ken Burns	male
26763	Science Fiction	150004	en	Taken	10.628230	DreamWorks SKG	United States of America	2002-12-02	877.0	English	Taken	7.4	76.0	[{'cast_id': 1, 'character': 'Allie Keys', 'cr...	[{'credit_id': '54177f100e0a2637ef0002e0', 'de...	2002	John Fawcett	male
34744	History	293603	en	The Roosevelts: An Intimate History	0.824032	Florentine Films	United States of America	2014-09-15	840.0	English	The Roosevelts: An Intimate History	8.3	6.0	[{'cast_id': 0, 'character': 'Eleanor Roosevel...	[{'credit_id': '541f65a70e0a261794000b7f', 'de...	2014	Ken Burns	male

	title	vote_count	vote_average	director_gender	director	popularity	year
33435	Wonder Woman	5025.0	7.2	female	Patty Jenkins	294.337037	2017
44338	The Bad Batch	160.0	5.3	female	Ana Lily Amirpour	78.807200	2017
43710	The Beguiled	279.0	5.8	female	Sofia Coppola	36.260510	2017
36298	Me Before You	2674.0	7.6	female	Thea Sharrock	34.347590	2016
2466	The Matrix	9079.0	7.9	female	Lana Wachowski	33.366332	1999
27559	Fifty Shades of Grey	3350.0	5.2	female	Sam Taylor-Johnson	33.068431	2015
13177	Twilight	3688.0	5.8	female	Catherine Hardwicke	31.282029	2008
23962	Jupiter Ascending	2816.0	5.2	female	Lilly Wachowski	21.463465	2015
41572	Underworld: Blood Wars	1619.0	5.2	female	Anna Foerster	20.029760	2016
8215	Shark Tale	1612.0	5.8	female	Vicky Jenson	17.999273	2004

	director_gender
male	19366
unknown	9955
female	1118

	director_gender
male	19366
unknown	9955
female	1118